list.of.packages <- c("reshape2","dplyr", "tidyr", "readr", "stringr", "plotly") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
do.call("require", list(X))
})
load(here::here("analyses", "meth_filter"))
meth_filter_reshaped <- melt(data=meth_filter, id=c("chr", "start", "end", "strand"), value.name = "count") %>%
tidyr::separate(col = "variable" , into = c("variable", "sample"), sep = "(?<=[a-zA-Z])\\s*(?=[0-9])") %>%
dcast(chr+start+end+strand+sample ~ variable) %>%
mutate(percMeth = 100*(numCs/coverage))
readr::write_delim(meth_filter_reshaped, "../analyses/meth_filter.tab", delim = '\t', col_names = FALSE)
wb: Print all lines in the second file a: input data file, which was created in previous code chunk b: save annotated gene list
intersectBed \
-wb \
-a "../analyses/meth_filter.tab" \
-b "../genome-features/Olurida_v081-20190709.gene.gff" \
> "../analyses/meth_filter_gene.tab"
wc -l "../analyses/meth_filter_gene.tab"
## 69984 ../analyses/meth_filter_gene.tab
– gene = contig number + start/end locus – group = sample number + gene – population = HC for Hood Canal, or SS for South Sound
library(car)
meth_filter_genes <-
read_delim(file = here::here("analyses", "meth_filter_gene.tab"), delim = "\t", col_names = c(colnames(meth_filter_reshaped), "contig_gene", "source_gene", "feature_gene", "start_gene", "end_gene", "unknown1_gene", "strand_gene", "unknown2_gene", "notes_gene")) %>%
mutate(gene=paste(contig_gene, start_gene, end_gene, sep="_")) %>%
mutate(group=paste(sample, gene, sep="_")) %>%
mutate(population=as.character(sample))
## Parsed with column specification:
## cols(
## chr = col_character(),
## start = col_double(),
## end = col_double(),
## strand = col_character(),
## sample = col_double(),
## coverage = col_double(),
## numCs = col_double(),
## numTs = col_double(),
## percMeth = col_double(),
## contig_gene = col_character(),
## source_gene = col_character(),
## feature_gene = col_character(),
## start_gene = col_double(),
## end_gene = col_double(),
## unknown1_gene = col_character(),
## strand_gene = col_character(),
## unknown2_gene = col_character(),
## notes_gene = col_character()
## )
meth_filter_genes$population <- car::recode(meth_filter_genes$population, "c('1', '2', '3', '4', '5', '6', '7', '8', '9')='HC'")
meth_filter_genes$population <- car::recode(meth_filter_genes$population, "c('10','11','12','13','14','15','16','17','18')='SS'")
length(unique(meth_filter_genes$gene))
## [1] 1379
min.filt <- count(meth_filter_genes, vars = c( group))
newdata <- min.filt[ which(min.filt$n > 4), ]
sub_meth_table <- meth_filter_genes[meth_filter_genes$group %in% newdata$vars,]
length(unique(sub_meth_table$gene))
## [1] 204
Note: this script was created by Hollie Putnam (GM.Rmd); there are minor revisions below. I retained some commented out lines (notably-testing for position w/n gene, such as intron & exon) in case we want to include those as factors in the future.
# create data frame to stored results
results <- data.frame()
gs <- unique(sub_meth_table$gene)
#first subset the unique dataframes and second run the GLMs
for(i in 1:length(gs)){
#subset the dataframe gene by gene
sub_meth_table1 <- subset(sub_meth_table, gene ==gs[i])
# fit glm position model
fit <- glm(matrix(c(numCs, numTs), ncol=2) ~ as.factor(population),
data=sub_meth_table1, family=binomial)
a <- anova(fit, test="Chisq")
# capture summary stats to data frame
df <- data.frame(sub_meth_table1[c("population", "sample", "contig_gene", "start_gene", "end_gene", "gene", "chr", "start", "sample", "coverage", "numCs", "numTs", "percMeth", "notes_gene")],
pval.treatment = a$`Pr(>Chi)`[2],
#pval.position = a$`Pr(>Chi)`[3], #uncomment if you want to include position of CpG within a gene
#pval.treatment_x_position = a$`Pr(>Chi)`[4], #uncomment if you want to include position of CpG within a gene interaction with treatment
stringsAsFactors = F)
# bind rows of temporary data frame to the results data frame
results <- rbind(results, df)
}
results[is.na(results)] <- 0
results$adj.pval.pop <- p.adjust(results$pval.treatment, method='BH')
#results$adj.pval.position <- p.adjust(results$pval.position, method='BH') #uncomment if you want to include position of CpG within a gene
#results$adj.pval.treatment_x_position <- p.adjust(results$pval.treatment_x_position, method='BH') #uncomment if you want to include position of CpG within a gene interaction with treatment
DMGs <- subset(results, adj.pval.pop < 0.05) #safe df with differentially methylated genes
DMGs.genes <- DMGs[!duplicated(DMGs$gene), c("contig_gene", "start_gene", "end_gene", "notes_gene", "pval.treatment", "adj.pval.pop")]
length(unique(DMGs$gene))
## [1] 46
# split gene data in "notes_gene" column into separate columns
DMGs.genes <- DMGs.genes %>%
mutate(ID=str_extract(notes_gene, "ID=(.*?);"),
Parent=str_extract(notes_gene, "Parent=(.*?);"),
Name=str_extract(notes_gene, "Name=(.*?);"),
Alias=str_extract(notes_gene, "Alias=(.*?);"),
AED=str_extract(notes_gene, "AED=(.*?);"),
eAED=str_extract(notes_gene, "eAED=(.*?);"),
Note=str_extract(notes_gene, "Note=(.*?);"),
Ontology_term=str_extract(notes_gene, "Ontology_term=(.*?);"),
Dbxref=str_extract(notes_gene, "Dbxref=(.*?);")
)
#Extract GO terms
DMGs.genes.GO <- DMGs.genes %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern="Ontology_term=",replacement = "")) %>%
mutate(Ontology_term = str_replace(Ontology_term, pattern=";",replacement = "")) %>%
separate(Ontology_term, sep=",", into=paste("GO", 1:11, sep="_")) %>%
pivot_longer(cols=c("GO_1","GO_2","GO_3","GO_4","GO_5","GO_6","GO_7","GO_8","GO_9","GO_10","GO_11"), names_to = "GO_number", values_to = "GO_term") %>%
dplyr::select(-GO_number) %>%
filter(!is.na(Note) & !is.na(GO_term))
## Warning: Expected 11 pieces. Missing pieces filled with `NA` in 17 rows [5, 7,
## 8, 9, 13, 14, 15, 16, 18, 22, 26, 33, 38, 42, 44, 45, 46].
write_delim(DMGs.genes.GO[,c("GO_term","adj.pval.pop")], path = here::here("analyses/", "DMRs.GO.txt"), delim = '\t', col_names = F) #write out df with just GO terms and p-adj values
| term ID | description | frequency | pin? | log10 p-value | uniqueness | dispensability |
|---|---|---|---|---|---|---|
| GO:0006468 | protein phosphorylation | 4.137 % | -3.7877 | 0.40 | 0.00 | |
| GO:0006807 | nitrogen compound metabolic process | 38.744 % | -2.2764 | 0.78 | 0.03 | |
| GO:0006207 | ‘de novo’ pyrimidine nucleobase biosynthetic process | 0.192 % | -2.2764 | 0.46 | 0.06 | |
| GO:0006281 | DNA repair | 2.234 % | -2.4853 | 0.50 | 0.20 | |
| GO:0006030 | chitin metabolic process | 0.077 % | -1.6311 | 0.49 | 0.21 | |
| GO:0006520 | cellular amino acid metabolic process | 5.591 % | -2.2764 | 0.42 | 0.35 | |
| GO:0006412 | translation | 5.686 % | -2.4853 | 0.28 | 0.55 | |
| GO:0016567 | protein ubiquitination | 0.523 % | -1.4336 | 0.44 | 0.56 |
# A plotting R script produced by the REVIGO server at http://revigo.irb.hr/
# If you found REVIGO useful in your work, please cite the following reference:
# Supek F et al. "REVIGO summarizes and visualizes long lists of Gene Ontology
# terms" PLoS ONE 2011. doi:10.1371/journal.pone.0021800
# --------------------------------------------------------------------------
# If you don't have the ggplot2 package installed, uncomment the following line:
# install.packages( "ggplot2" );
library( ggplot2 );
# --------------------------------------------------------------------------
# If you don't have the scales package installed, uncomment the following line:
# install.packages( "scales" );
library( scales );
## Warning: package 'scales' was built under R version 3.5.2
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
# --------------------------------------------------------------------------
# Here is your data from REVIGO. Scroll down for plot configuration options.
revigo.names <- c("term_ID","description","frequency_%","plot_X","plot_Y","plot_size","log10_p_value","uniqueness","dispensability");
revigo.data <- rbind(c("GO:0006468","protein phosphorylation", 4.137, 4.305,-3.360, 5.725,-3.7877,0.400,0.000),
c("GO:0006807","nitrogen compound\nmetabolic process",38.744, 0.501, 5.551, 6.696,-2.2764,0.783,0.034),
c("GO:0006207","'de novo' pyrimidine\nnucleobase biosynthetic process", 0.192,-3.965,-5.039, 4.392,-2.2764,0.459,0.063),
c("GO:0006281","DNA repair", 2.234, 1.011,-6.623, 5.457,-2.4853,0.500,0.201),
c("GO:0006030","chitin metabolic\nprocess", 0.077,-5.068,-0.868, 3.994,-1.6311,0.491,0.211),
c("GO:0006520","cellular amino acid\nmetabolic process", 5.591,-4.136,-3.302, 5.856,-2.2764,0.423,0.351),
c("GO:0006412","translation", 5.686,-0.187,-3.138, 5.863,-2.4853,0.280,0.551),
c("GO:0016567","protein ubiquitination", 0.523, 4.909,-2.319, 4.827,-1.4336,0.440,0.560));
one.data <- data.frame(revigo.data);
names(one.data) <- revigo.names;
one.data <- one.data [(one.data$plot_X != "null" & one.data$plot_Y != "null"), ];
one.data$plot_X <- as.numeric( as.character(one.data$plot_X) );
one.data$plot_Y <- as.numeric( as.character(one.data$plot_Y) );
one.data$plot_size <- as.numeric( as.character(one.data$plot_size) );
one.data$log10_p_value <- as.numeric( as.character(one.data$log10_p_value) );
one.data$frequency <- as.numeric( as.character(one.data$frequency) );
one.data$uniqueness <- as.numeric( as.character(one.data$uniqueness) );
one.data$dispensability <- as.numeric( as.character(one.data$dispensability) );
head(one.data);
## term_ID description frequency_%
## 1 GO:0006468 protein phosphorylation 4.137
## 2 GO:0006807 nitrogen compound\nmetabolic process 38.744
## 3 GO:0006207 'de novo' pyrimidine\nnucleobase biosynthetic process 0.192
## 4 GO:0006281 DNA repair 2.234
## 5 GO:0006030 chitin metabolic\nprocess 0.077
## 6 GO:0006520 cellular amino acid\nmetabolic process 5.591
## plot_X plot_Y plot_size log10_p_value uniqueness dispensability frequency
## 1 4.305 -3.360 5.725 -3.7877 0.400 0.000 4.137
## 2 0.501 5.551 6.696 -2.2764 0.783 0.034 38.744
## 3 -3.965 -5.039 4.392 -2.2764 0.459 0.063 0.192
## 4 1.011 -6.623 5.457 -2.4853 0.500 0.201 2.234
## 5 -5.068 -0.868 3.994 -1.6311 0.491 0.211 0.077
## 6 -4.136 -3.302 5.856 -2.2764 0.423 0.351 5.591
# --------------------------------------------------------------------------
# Names of the axes, sizes of the numbers and letters, names of the columns,
# etc. can be changed below
p1 <- ggplot( data = one.data );
p1 <- p1 + geom_point( aes( plot_X, plot_Y, colour = log10_p_value, size = plot_size), alpha = I(0.6) ) + scale_size_area();
p1 <- p1 + scale_colour_gradientn( colours = c("blue", "green", "yellow", "red"), limits = c( min(one.data$log10_p_value), 0) );
p1 <- p1 + geom_point( aes(plot_X, plot_Y, size = plot_size), shape = 21, fill = "transparent", colour = I (alpha ("black", 0.6) )) + scale_size_area();
## Scale for 'size' is already present. Adding another scale for 'size', which
## will replace the existing scale.
p1 <- p1 + scale_size( range=c(5, 30)) + theme_bw(); # + scale_fill_gradientn(colours = heat_hcl(7), limits = c(-300, 0) );
## Scale for 'size' is already present. Adding another scale for 'size', which
## will replace the existing scale.
#ex <- one.data [ one.data$dispensability < 0.15, ];
p1 <- p1 + geom_text( data = one.data, aes(plot_X, plot_Y, label = as.character(description)), colour = I(alpha("black", 0.85)), size = 3 );
p1 <- p1 + labs (y = "semantic space x", x = "semantic space y");
p1 <- p1 + theme(legend.key = element_blank()) ;
one.x_range = max(one.data$plot_X) - min(one.data$plot_X);
one.y_range = max(one.data$plot_Y) - min(one.data$plot_Y);
p1 <- p1 + xlim(min(one.data$plot_X)-one.x_range/10,max(one.data$plot_X)+one.x_range/10);
p1 <- p1 + ylim(min(one.data$plot_Y)-one.y_range/10,max(one.data$plot_Y)+one.y_range/10);
# --------------------------------------------------------------------------
# Output the plot to screen
ggplotly(p1);
# Uncomment the line below to also save the plot to a file.
# The file type depends on the extension (default=pdf).
# ggsave("C:/Users/path_to_your_file/revigo-plot.pdf");